##############################
### SETUP ###
##############################
# Loading required libraries
library(Seurat)
library(harmony)
library(scDblFinder)
library(ggplot2)
library(dplyr)
set.seed(1)
options(future.globals.maxSize = 3 * 1024^3)
# QC thresholds initial
mt_max <- 20
nfeature_min <- 200
nfeature_max <- 6000
ncount_min <- 500
ncount_max <- 40000
cluster_res <- 0.8
# S gene signatures from paper (sullivan et al. 2024)
m2ts_genes <- c("MRC1","MS4A4A","CD36","CCL13","CCL18","CCL23","SLC38A6","FGL2","FN1","MAF")
m1ts_genes <- c("CCR7","IL2RA","CXCL11","CCL19","CXCL10","PLA1A","PTX3")
ctlts_genes <- c("GZMA","GZMB","GZMH","GZMM","PRF1")
dir.create("data_processed", showWarnings = FALSE)
dir.create("tables", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)
Downloading the 10x matrices for GSE228598 from GEO. 10 of the 28 samples from the paper were publicly available.
##############################
### DATA DOWNLOAD ###
##############################
geo_dir <- "data/geo"
dir.create(geo_dir, recursive = TRUE, showWarnings = FALSE)
supp_url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE228598&format=file"
tar_file <- file.path(geo_dir, "GSE228598_RAW.tar")
# Only download if tar doesn't exist and no gz files present
gz_present <- length(list.files(geo_dir, pattern = "\\.gz$")) > 0
if (!file.exists(tar_file) && !gz_present) {
download.file(supp_url, tar_file, mode = "wb")
}
# Only extract if no gz files present
if (!gz_present) {
untar(tar_file, exdir = geo_dir)
}
merge into one Seurat object.
##############################
### LOAD & MERGE ###
##############################
# Reorganizing 10x triplets into per-sample dirs for Read10X
gz_files <- list.files(geo_dir, pattern = "barcodes\\.tsv\\.gz$", full.names = TRUE)
sample_ids <- sub(".*_(P\\d+)_barcodes.*", "\\1", basename(gz_files))
obj_list <- lapply(seq_along(sample_ids), function(i) {
sample_dir <- file.path(geo_dir, sample_ids[i])
dir.create(sample_dir, showWarnings = FALSE)
prefix <- sub("_barcodes\\.tsv\\.gz$", "_", basename(gz_files[i]))
for (suffix in c("barcodes.tsv.gz", "features.tsv.gz", "matrix.mtx.gz")) {
src <- file.path(geo_dir, paste0(prefix, suffix))
dst <- file.path(sample_dir, suffix)
if (!file.exists(dst)) file.copy(src, dst)
}
counts <- Read10X(sample_dir)
obj <- CreateSeuratObject(counts, project = sample_ids[i], min.cells = 3, min.features = 200)
obj$sample <- sample_ids[i]
obj
})
names(obj_list) <- sample_ids
merged <- merge(obj_list[[1]], obj_list[-1], add.cell.ids = sample_ids)
rm(obj_list)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 11429040 610.4 20222528 1080.0 NA 15987631 853.9
## Vcells 131491563 1003.3 342551921 2613.5 16384 308514726 2353.8
##############################
### QUALITY CONTROL ###
##############################
# Computing mito percentage
merged[["percent.mt"]] <- PercentageFeatureSet(merged, pattern = "^MT-")
# QC violin plots
VlnPlot(merged, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
group.by = "sample", pt.size = 0, ncol = 3) &
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# QC scatter plots
p1 <- FeatureScatter(merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
group.by = "sample", pt.size = 0.1) + NoLegend()
p2 <- FeatureScatter(merged, feature1 = "nCount_RNA", feature2 = "percent.mt",
group.by = "sample", pt.size = 0.1) + NoLegend()
p1 + p2
nCount has extreme outliers that squash the violin. The scatter shows most cells cluster in a tight nCount/nFeature band with a few high-mito stragglers. Standard cutoffs below.
# Capped violin for UMI distribution
VlnPlot(merged, features = "nCount_RNA", group.by = "sample", pt.size = 0) +
ylim(0, 50000) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Applying QC filters
merged <- subset(merged,
nFeature_RNA > nfeature_min & nFeature_RNA < nfeature_max &
nCount_RNA > ncount_min & nCount_RNA < ncount_max &
percent.mt < mt_max)
cat("After QC:", ncol(merged), "cells,", nrow(merged), "genes\n")
## After QC: 51742 cells, 16653 genes
table(merged$sample)
##
## P1 P10 P2 P3 P4 P5 P6 P7 P8 P9
## 8114 6378 6209 1896 6922 6483 4816 3945 4149 2830
Running scDblFinder per sample.
##############################
### DOUBLET REMOVAL ###
##############################
merged <- JoinLayers(merged)
sce <- as.SingleCellExperiment(merged)
sce <- scDblFinder(sce, samples = "sample")
## | | | 0% | |======= | 10% | |============== | 20% | |===================== | 30% | |============================ | 40% | |=================================== | 50% | |========================================== | 60% | |================================================= | 70% | |======================================================== | 80% | |=============================================================== | 90% | |======================================================================| 100%
merged$doublet_class <- sce$scDblFinder.class
merged$doublet_score <- sce$scDblFinder.score
table(merged$doublet_class, merged$sample)
##
## P1 P10 P2 P3 P4 P5 P6 P7 P8 P9
## singlet 7166 5853 5657 1808 6278 5902 4447 3686 3865 2676
## doublet 948 525 552 88 644 581 369 259 284 154
merged <- subset(merged, doublet_class == "singlet")
cat("After doublet removal:", ncol(merged), "cells\n")
## After doublet removal: 47338 cells
rm(sce)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 11909723 636.1 20222528 1080.0 NA 20222528 1080.0
## Vcells 214575754 1637.1 607698195 4636.4 16384 607698073 4636.4
~8% doublet rate across samples, about what you’d expect for standard 10x loading density.
Using LogNormalize + Harmony for batch correction across the 10 patient samples.
Tried SCTransform but hit the 16GB memory limit Lol, and the paper used standard normalization anyway so proceeding with that
##############################
### NORMALIZATION & ###
### INTEGRATION ###
##############################
# Saving QC checkpoint
saveRDS(merged, "data_processed/gse228598_merged_qc.rds")
# Standard log-normalization pipeline
merged <- NormalizeData(merged)
merged <- FindVariableFeatures(merged, nfeatures = 2000)
merged <- ScaleData(merged)
merged <- RunPCA(merged, verbose = FALSE)
# Elbow plot for PC selection
ElbowPlot(merged, ndims = 40)
Elbow flattens around PC 25-30, going with 30.
# Harmony batch correction
merged <- RunHarmony(merged, group.by.vars = "sample", reduction = "pca", reduction.save = "harmony")
merged <- RunUMAP(merged, reduction = "harmony", dims = 1:30, verbose = FALSE)
merged <- FindNeighbors(merged, reduction = "harmony", dims = 1:30, verbose = FALSE)
merged <- FindClusters(merged, resolution = cluster_res, verbose = FALSE)
# UMAP colored by cluster and by sample
p1 <- DimPlot(merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE) + NoLegend()
p2 <- DimPlot(merged, reduction = "umap", group.by = "sample")
p1 + p2
ggsave("figures/umap_harmony_clusters.png", p1 + p2, width = 10, height = 4)
Samples mix well in the central mass, Harmony did its job like always, long live korsunsky lab. Isolated clusters on the right are patient-enriched, likely epithelial/mesothelial populations.
Assigning clusters to compartments using canonical marker expression.
##############################
### COMPARTMENT ###
### ANNOTATION ###
##############################
broad_markers <- c(
"PTPRC",
"CD3D", "CD3E", "NKG7",
"CD79A", "MS4A1", "MZB1", "JCHAIN",
"CD14", "CD68", "LYZ", "FCGR3A",
"EPCAM", "KRT18", "KRT19",
"PECAM1", "VWF",
"COL1A1", "DCN",
"MSLN", "CALB2"
)
DotPlot(merged, features = broad_markers, cluster.idents = TRUE) +
RotatedAxis()
# DEGs per cluster to back up the compartment calls
markers <- FindAllMarkers(merged, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.5)
write.csv(markers, "tables/cluster_markers_all.csv", row.names = FALSE)
# Top 2 markers per cluster
top2 <- markers %>% group_by(cluster) %>% slice_max(avg_log2FC, n = 2)
DoHeatmap(merged, features = top2$gene, size = 3, cells = WhichCells(merged, downsample = 100)) +
NoLegend() +
theme(axis.text.y = element_text(size = 7))
ggsave("figures/heatmap_cluster_markers.png", width = 16, height = 8)
Clusters separate cleanly into the expected lineages. Myeloid clusters (0, 1, 3, 4, 8, 12, 14, 22) light up for LYZ/CD14/CD68. T/NK clusters have CD3D/NKG7. Epithelial clusters express keratins. Cluster 14 is a bit ambiguous – could be pDC or activated monocytes, but grouping with myeloid for now.
# Mapping clusters to broad compartments
compartment_map <- c(
"0" = "Myeloid", "1" = "Myeloid", "3" = "Myeloid", "4" = "Myeloid",
"8" = "Myeloid", "12" = "Myeloid", "14" = "Myeloid", "22" = "Myeloid",
"2" = "T/NK", "5" = "T/NK", "6" = "T/NK", "7" = "T/NK",
"9" = "T/NK", "17" = "T/NK", "18" = "T/NK", "19" = "T/NK", "21" = "T/NK",
"11" = "B cell", "15" = "Plasma",
"13" = "Epithelial", "16" = "Epithelial", "20" = "Epithelial",
"10" = "Mesothelial", "23" = "Mesothelial"
)
merged$compartment <- unname(compartment_map[as.character(merged$seurat_clusters)])
table(merged$compartment)
##
## B cell Epithelial Mesothelial Myeloid Plasma T/NK
## 1318 1562 1368 26921 275 15894
DimPlot(merged, group.by = "compartment", label = TRUE) + NoLegend()
ggsave("figures/umap_broad_compartments.png", width = 6, height = 5)
Myeloid dominant, T/NK second. Epithelial and mesothelial are mostly patient-specific as expected for peritoneal fluid.
# Exporting cell counts
write.csv(as.data.frame.matrix(table(merged$sample, merged$compartment)),
"tables/broad_compartment_counts.csv")
write.csv(table(merged$sample, merged$compartment), "tables/cell_counts_by_sample.csv")
Subsetting the myeloid compartment, re-clustering, and flagging M2 macrophages as CD68+CD163+ per Sullivan et al.
##############################
### MACROPHAGE SUBSET ###
##############################
macrophages <- subset(merged, compartment == "Myeloid")
macrophages <- NormalizeData(macrophages)
macrophages <- FindVariableFeatures(macrophages, nfeatures = 2000)
macrophages <- ScaleData(macrophages)
macrophages <- RunPCA(macrophages, verbose = FALSE)
macrophages <- RunHarmony(macrophages, group.by.vars = "sample", reduction = "pca", reduction.save = "harmony")
macrophages <- RunUMAP(macrophages, reduction = "harmony", dims = 1:20, verbose = FALSE)
macrophages <- FindNeighbors(macrophages, reduction = "harmony", dims = 1:20, verbose = FALSE)
# tried res 0.4 and 0.8 too, 0.6 gives cleanest separation
macrophages <- FindClusters(macrophages, resolution = 0.6, verbose = FALSE)
# Flagging CD68+CD163+ M2 macrophages
cd68_expr <- GetAssayData(macrophages, layer = "data")["CD68", ]
cd163_expr <- GetAssayData(macrophages, layer = "data")["CD163", ]
macrophages$m2_flag <- ifelse(cd68_expr > 0 & cd163_expr > 0, "CD68+CD163+", "Other")
table(macrophages$m2_flag)
##
## CD68+CD163+ Other
## 15148 11773
p1 <- DimPlot(macrophages, label = TRUE) + NoLegend() + ggtitle("Myeloid clusters")
p2 <- DimPlot(macrophages, group.by = "m2_flag") + ggtitle("CD68+CD163+ M2")
p1 + p2
Over half the myeloid cells are CD68+CD163+. They cluster together on the UMAP rather than scattering randomly, which suggests M2 polarization is a real transcriptional state here, not just noise from dropout.
Scoring all macrophages for the three Sullivan et al. signatures.
##############################
### MODULE SCORES ###
##############################
macrophages <- AddModuleScore(macrophages, features = list(m2ts_genes), name = "M2TS")
macrophages <- AddModuleScore(macrophages, features = list(m1ts_genes), name = "M1TS")
macrophages <- AddModuleScore(macrophages, features = list(ctlts_genes), name = "CTLTS")
saveRDS(macrophages, "data_processed/gse228598_macrophages.rds")
FeaturePlot(macrophages, features = c("M2TS1", "M1TS1", "CTLTS1"), ncol = 3, order = TRUE) &
scale_color_viridis_c()
ggsave("figures/umap_m2ts_macrophage_subset.png", width = 10, height = 4)
VlnPlot(macrophages, features = "M2TS1", group.by = "m2_flag", pt.size = 0) +
ggtitle("M2TS score by CD68+CD163+ status")
ggsave("figures/violin_m2ts_by_m2flag.png", width = 6, height = 4)
# Statistical test for M2TS enrichment
wt <- wilcox.test(M2TS1 ~ m2_flag, data = macrophages@meta.data)
cat("Wilcoxon rank-sum test: W =", wt$statistic, ", p =", format.pval(wt$p.value, digits = 3), "\n")
## Wilcoxon rank-sum test: W = 112193390 , p = <2e-16
M2TS tracks with the CD68+CD163+ phenotype as expected (Wilcoxon p < 2.2e-16). This replicates the paper’s core finding from their Figure 3A. M1TS and CTLTS are low in macrophages which makes sense since those are T-cell signatures.
##############################
### SUMMARY STATS ###
##############################
m2_cells <- subset(macrophages, m2_flag == "CD68+CD163+")
m2ts_by_sample <- m2_cells@meta.data |>
group_by(sample) |>
summarise(
n_m2_cells = n(),
mean_m2ts = mean(M2TS1),
median_m2ts = median(M2TS1),
.groups = "drop"
)
write.csv(m2ts_by_sample, "tables/m2ts_summary_by_sample.csv", row.names = FALSE)
m2ts_by_sample
## # A tibble: 10 × 4
## sample n_m2_cells mean_m2ts median_m2ts
## <chr> <int> <dbl> <dbl>
## 1 P1 1147 -0.0762 -0.103
## 2 P10 3334 0.0277 0.0328
## 3 P2 490 -0.129 -0.138
## 4 P3 541 -0.0862 -0.0903
## 5 P4 2221 0.00603 0.00700
## 6 P5 1368 -0.0906 -0.101
## 7 P6 1187 -0.0605 -0.0723
## 8 P7 1490 0.164 0.173
## 9 P8 2254 0.169 0.167
## 10 P9 1116 -0.0234 -0.0275
M2TS varies a lot across patients. P7 and P8 are the highest, P2 is the lowest. The paper used this kind of inter-patient heterogeneity to stratify into M2TS-high vs M2TS-low groups.
# Scoring all cells for cross-compartment comparison
merged <- AddModuleScore(merged, features = list(m2ts_genes), name = "M2TS")
merged <- AddModuleScore(merged, features = list(m1ts_genes), name = "M1TS")
merged <- AddModuleScore(merged, features = list(ctlts_genes), name = "CTLTS")
m2ts_by_celltype <- merged@meta.data |>
group_by(compartment) |>
summarise(
n_cells = n(),
mean_m2ts = mean(M2TS1),
mean_m1ts = mean(M1TS1),
mean_ctlts = mean(CTLTS1),
.groups = "drop"
)
write.csv(m2ts_by_celltype, "tables/m2ts_summary_by_celltype.csv", row.names = FALSE)
m2ts_by_celltype
## # A tibble: 6 × 5
## compartment n_cells mean_m2ts mean_m1ts mean_ctlts
## <chr> <int> <dbl> <dbl> <dbl>
## 1 B cell 1318 -0.245 0.0814 -0.109
## 2 Epithelial 1562 -0.309 -0.0465 -0.198
## 3 Mesothelial 1368 -0.119 -0.0705 -0.260
## 4 Myeloid 26921 0.0959 -0.0144 -0.208
## 5 Plasma 275 -0.283 -0.0208 0.124
## 6 T/NK 15894 -0.219 0.0330 0.366
Signatures land where they should – M2TS highest in myeloid, CTLTS highest in T/NK. Sanity check passed.
Using the Azimuth PBMC reference to validate marker-based annotation and get finer-grained immune subtypes. These labels get saved so Part 2 can use them for CellChat.
##############################
### AZIMUTH MAPPING ###
##############################
library(Azimuth)
immune <- subset(merged, compartment %in% c("Myeloid", "T/NK", "B cell", "Plasma"))
immune <- RunAzimuth(immune, reference = "pbmcref")
table(immune$predicted.celltype.l1)
##
## B CD4 T CD8 T DC Mono NK other other T
## 1521 6254 4054 3133 27110 855 437 1044
table(immune$predicted.celltype.l2)
##
## B intermediate B memory B naive CD14 Mono
## 221 341 817 21587
## CD16 Mono CD4 CTL CD4 Naive CD4 Proliferating
## 5365 84 107 305
## CD4 TCM CD4 TEM CD8 Naive CD8 Proliferating
## 4994 564 305 43
## CD8 TCM CD8 TEM cDC1 cDC2
## 339 3378 145 3025
## dnT gdT HSPC ILC
## 209 623 1 56
## MAIT NK NK Proliferating NK_CD56bright
## 288 631 86 134
## pDC Plasmablast Platelet Treg
## 81 140 390 149
Azimuth breaks the myeloid compartment into mostly CD14 Mono and CD16 Mono with a smaller cDC population. T cells resolve into CD4 TCM, CD8 TEM, MAIT, gdT, Treg subsets – good granularity for CellChat.
# Cross-tabulating marker-based vs Azimuth labels
cross_tab <- table(
marker = immune$compartment,
azimuth = immune$predicted.celltype.l1
)
cross_tab
## azimuth
## marker B CD4 T CD8 T DC Mono NK other other T
## B cell 1289 4 6 2 13 2 1 1
## Myeloid 3 91 11 409 26067 0 335 5
## Plasma 177 7 2 79 7 1 2 0
## T/NK 52 6152 4035 2643 1023 852 99 1038
write.csv(as.data.frame.matrix(cross_tab), "tables/marker_vs_azimuth_table.csv")
Strong concordance overall. Myeloid maps almost entirely to Mono, B cells map cleanly to B. The main discrepancy is a chunk of our T/NK cells that Azimuth calls DC – probably pDCs sitting at the boundary. Not worrying about it for now.
# Transferring Azimuth labels back to the full merged object
merged$azimuth_l1 <- NA
merged$azimuth_l2 <- NA
overlap <- intersect(Cells(immune), Cells(merged))
merged$azimuth_l1[match(overlap, Cells(merged))] <- immune$predicted.celltype.l1[match(overlap, Cells(immune))]
merged$azimuth_l2[match(overlap, Cells(merged))] <- immune$predicted.celltype.l2[match(overlap, Cells(immune))]
# Saving with all annotations (compartment + Azimuth) for Part 2
saveRDS(merged, "data_processed/gse228598_harmony.rds")